##############################################################
### Project: Where do parties talk about what? Party issue ###
###          salience across communication channels        ###
### Task:    Analysis - Preparation                        ###
### Title:   03_Analysis_Preparation.R                     ###
##############################################################

#--------------------------------------------------------------------------------------------------
# Description:
#
# This script creates the main data set used for the statistical analysis. The following lines of
# code produce the dependent and independent variables for the estimated regression models in the
# paper.
#--------------------------------------------------------------------------------------------------

#---------------------------------------------------------------------------------------------------------------------

# Preparation

## load packages
library(tidyverse)
library(xlsx)
library(readxl)
library(manifestoR)
library(zoo)
library(lubridate)
library(stringi)

#---------------------------------------------------------------------------------------------------------------------

# Load and prepare classified text data

## Helpers shared by all four text sources

# Read a classified-text CSV (UTF-8) and replace underscores in the issue labels with spaces.
read_classified_texts <- function(file) {
  data <- read.csv(file, encoding = "UTF-8")
  data$issue <- gsub("_", " ", data$issue)
  data
}

# Add `month` (numeric YYYYMM) and `quarter` (lubridate year.quarter) derived from the column
# named by `date_col`, then relabel parties as "Mitte": EVP/PEV and CVP/PDC in every month,
# BDP/PBD only in months after 2019-09.
add_period_and_party <- function(data, date_col = "date") {
  dates <- data[[date_col]]
  data$month <- as.numeric(gsub("-", "",
                                str_extract(dates, "[[:digit:]]{4,4}-[[:digit:]]{2,2}")))
  data$quarter <- quarter(as.Date(dates), with_year = TRUE)

  data$party <- gsub("EVP/PEV", "Mitte", data$party)
  after_sep_2019 <- data$month > 201909
  data$party[after_sep_2019] <- gsub("BDP/PBD", "Mitte", data$party[after_sep_2019])
  data$party <- gsub("CVP/PDC", "Mitte", data$party)
  data
}

## Press releases
data_pressreleases <- read_classified_texts("Results_Pressreleases.csv")
data_pressreleases <- add_period_and_party(data_pressreleases, date_col = "date")

## Tweets (Parties)
# the party-account tweets carry their timestamp in `created_at` instead of `date`
data_tweets_party <- read_classified_texts("Results_TweetsParty.csv")
data_tweets_party <- add_period_and_party(data_tweets_party, date_col = "created_at")

## Tweets (Persons)
data_tweets_person <- read_classified_texts("Results_TweetsPerson.csv")
data_tweets_person <- add_period_and_party(data_tweets_person, date_col = "date")

## Parliamentary speeches
data_parlspeeches <- read_classified_texts("Results_ParlSpeeches.csv")
# Austrian speeches dated 2019-05-20 are re-dated to 2021-05-20 before month/quarter are derived.
# NOTE(review): presumably corrects a mis-dated sitting in the raw data - confirm.
data_parlspeeches$date[data_parlspeeches$country == "AT"] <- gsub("2019-05-20", "2021-05-20",
                                  data_parlspeeches$date[data_parlspeeches$country == "AT"])
data_parlspeeches <- add_period_and_party(data_parlspeeches, date_col = "date")
# Two steps on purpose: collapse "PILZ/JETZT" to "JETZT" first, then expand every "JETZT", so
# both spellings end up as "PILZ/JETZT" without producing "PILZ/PILZ/JETZT".
data_parlspeeches$party <- gsub("PILZ/JETZT", "JETZT", data_parlspeeches$party)
data_parlspeeches$party <- gsub("JETZT", "PILZ/JETZT", data_parlspeeches$party)

#---------------------------------------------------------------------------------------------------------------------

# Load coded manifestos from Manifesto Project

# Manifesto Project API key: read from the MANIFESTO_API_KEY environment variable so that the
# secret never has to be written into (and committed with) this script. If the variable is not
# set, the key is the empty string, exactly as before; a literal key can still be supplied by
# replacing the Sys.getenv() call.
mp_setapikey(key.file = NULL, key = Sys.getenv("MANIFESTO_API_KEY", unset = ""))
# Flatten a corpus into one data frame with one row per row of each document's content table.
#
# x:       list-like corpus; for every element, `[[1]]` is the content data frame and `[[2]]`
#          is a list of metadata with the fields manifesto_id, party, date, language and
#          annotations.
# returns: the content rows of all documents bound together, each prefixed with the document's
#          id, party_id, date, language and annotation columns; NULL when `x` has no documents.
get_corpus <- function(x) {
  # seq_len(length(x)) instead of 1:length(x): an empty corpus yields no iterations rather than
  # an out-of-bounds access, and length() dispatches on corpus classes.
  docs <- lapply(seq_len(length(x)), function(i) {
    meta <- x[[i]][[2]]
    cbind(id = meta$manifesto_id,
          party_id = meta$party,
          date = meta$date,
          language = meta$language,
          annotation = meta$annotations,
          x[[i]][[1]])
  })
  # bind once at the end instead of growing the result inside the loop
  do.call(rbind, docs)
}

# For every country two sets of manifestos are requested from the Manifesto Project via
# mp_corpus(), selected by a logical expression on `countryname` and `date`, and each set is
# flattened into a data frame with get_corpus(). The two data frames per country are stacked and
# tagged with a two-letter `country` code.
# NOTE(review): the `date` values look like YYYYMM election months - confirm against the
# Manifesto Project main dataset.

## Austria
manifestos_AT2017 <- mp_corpus(countryname == "Austria" & date == "201710")
manifesto_corpus_AT2017 <- get_corpus(manifestos_AT2017)
manifestos_AT2019 <- mp_corpus(countryname == "Austria" & date == "201909")
manifesto_corpus_AT2019 <- get_corpus(manifestos_AT2019)

# stack both Austrian sets and add the country code as the last column
manifesto_corpus_AT <- rbind(manifesto_corpus_AT2017, manifesto_corpus_AT2019)
manifesto_corpus_AT$country <- "AT"

## Switzerland
manifestos_CH2015 <- mp_corpus(countryname == "Switzerland" & date == "201510")
manifesto_corpus_CH2015 <- get_corpus(manifestos_CH2015)
manifestos_CH2019 <- mp_corpus(countryname == "Switzerland" & date == "201910")
manifesto_corpus_CH2019 <- get_corpus(manifestos_CH2019)

# stack both Swiss sets and add the country code as the last column
manifesto_corpus_CH <- rbind(manifesto_corpus_CH2015, manifesto_corpus_CH2019)
manifesto_corpus_CH$country <- "CH"

## Germany
manifestos_DE2017 <- mp_corpus(countryname == "Germany" & date == "201709")
manifesto_corpus_DE2017 <- get_corpus(manifestos_DE2017)
manifestos_DE2021 <- mp_corpus(countryname == "Germany" & date == "202109")
manifesto_corpus_DE2021 <- get_corpus(manifestos_DE2021)

# stack both German sets and add the country code as the last column
manifesto_corpus_DE <- rbind(manifesto_corpus_DE2017, manifesto_corpus_DE2021)
manifesto_corpus_DE$country <- "DE"

## finalise manifesto corpus

# Stack the three country corpora, keep German-language documents and drop rows whose cmp_code
# is missing, "H" or "000".
manifesto_corpus <- rbind(manifesto_corpus_AT, manifesto_corpus_CH, manifesto_corpus_DE)
manifesto_corpus <- filter(manifesto_corpus, language == "german")
manifesto_corpus <- filter(manifesto_corpus,
                           !is.na(cmp_code), cmp_code != "H", cmp_code != "000")

# Lookup table from Manifesto Project party id to the party label used in the text data.
df_partynames <- local({
  party_lookup <- c(
    # Austria
    "42110" = "Grüne/GA",
    "42320" = "SPÖ",
    "42420" = "FPÖ/VDU",
    "42430" = "NEOS",
    "42520" = "ÖVP",
    "42120" = "PILZ/JETZT",
    # Switzerland
    "43110" = "GPS/PES",
    "43120" = "GLP",
    "43220" = "PdAS/PdTS",
    "43320" = "SPS/PSS",
    "43420" = "FDP/PLR",
    "43520" = "CVP/PDC",
    "43530" = "EVP/PEV",
    "43810" = "SVP/UDC",
    "43811" = "BDP/PBD",
    "43902" = "MCG",
    "43020" = "EAG",
    "43711" = "EDU/UDF",
    "43901" = "LdT",
    # Germany
    "41113" = "Bündnis90/Die Grünen",
    "41223" = "Linke/PDS",
    "41320" = "SPD",
    "41420" = "FDP",
    "41521" = "CDU/CSU",
    "41953" = "AfD"
  )
  data.frame(party_id = as.numeric(names(party_lookup)),
             party = unname(party_lookup))
})

# Attach the party label, then keep and reorder columns by position.
# NOTE(review): positions 9 and 10 are expected to be `country` and `party` - confirm that the
# downloaded corpus has exactly eight columns before `country` is appended.
manifesto_corpus <- left_join(manifesto_corpus, df_partynames, by = "party_id")
manifesto_corpus <- manifesto_corpus[, c(1, 2, 9, 10, 3, 6, 7)]

# Map each cmp_code to an issue using columns 5 and 6 of the first codebook sheet.
codebook <- read.xlsx("Codebook.xlsx", sheetIndex = 1)
codes <- setNames(codebook[, 5:6], c("issue", "cmp_code"))
manifesto_corpus <- left_join(manifesto_corpus, codes, by = "cmp_code")

#---------------------------------------------------------------------------------------------------------------------

# save text data sets (written before the minimum-count filter below is applied)
save(list = c("data_pressreleases", "data_parlspeeches", "data_tweets_party",
              "data_tweets_person", "manifesto_corpus"),
     file = "Text_Data.RDA")

#---------------------------------------------------------------------------------------------------------------------

# Filter data

# Info: custom function that drops every party-quarter combination with fewer than `min_num`
#       press releases, tweets or parliamentary speeches (min_num = 10 is used below)

# Keep only rows that belong to a party-quarter cell with at least `min_num` rows.
#
# data:    data frame with the columns `party` and `quarter` (numeric year.quarter).
# min_num: minimum number of rows a party-quarter cell needs to be kept.
# returns: `data` without the rows of under-populated cells; rows whose party or quarter is NA
#          are dropped as well, because table() does not count them.
filter_data <- function(data, min_num) {
  counts <- data.frame(table(data$party, data$quarter,
                             dnn = c("party", "quarter"))) %>%
    mutate(drop = Freq < min_num)
  # table() turns the quarters into factor labels; convert back to numeric for the join
  counts$quarter <- as.numeric(as.character(counts$quarter))

  # explicit keys and column names instead of a natural join on positional columns
  out <- left_join(data, counts[, c("party", "quarter", "drop")],
                   by = c("party", "quarter")) %>%
    filter(!drop)
  out$drop <- NULL

  out
}

# apply the same threshold of 10 documents per party and quarter to every text type
data_pressreleases <- data_pressreleases %>% filter_data(min_num = 10)
data_tweets_party <- data_tweets_party %>% filter_data(min_num = 10)
data_tweets_person <- data_tweets_person %>% filter_data(min_num = 10)
data_parlspeeches <- data_parlspeeches %>% filter_data(min_num = 10)

#---------------------------------------------------------------------------------------------------------------------

# Calculate manifesto agendas

# Count coded rows per issue and manifesto (all issue x manifesto combinations, zeros included).
df_manifestos <- data.frame(table(manifesto_corpus$issue, manifesto_corpus$id,
                                  dnn = c("issue", "id")))
# Attach the manifesto-level columns (first five columns of the corpus) and de-duplicate.
df_manifestos <- left_join(df_manifestos, unique(manifesto_corpus[, 1:5]), by = "id")
df_manifestos <- unique(df_manifestos)
# Salience = share of an issue among all counted rows of the same manifesto, in percent.
df_manifestos <- dplyr::group_by(df_manifestos, id)
df_manifestos <- mutate(df_manifestos, manifesto_salience = Freq / sum(Freq) * 100)
# Keep columns 5-7, the issue and the salience; the third kept column is renamed.
df_manifestos <- df_manifestos[, c(5:7, 1, 8)]
names(df_manifestos)[3] <- "relevant_manifesto"

# use "CVP/PDC" as representative manifesto for "Mitte"
df_manifestos$party <- gsub("CVP/PDC", "Mitte", df_manifestos$party, fixed = TRUE)

#---------------------------------------------------------------------------------------------------------------------

# Calculate press release, tweets and parl. speech agendas

## custom function to calculate agendas per party and quarter for each text type
#
# data:       classified texts with the columns `party`, `country`, `quarter` and `issue`.
# type:       label stored in the `type` column of the result (e.g. "Press releases").
# manifestos: data frame providing the `party` and `issue` values to cover; defaults to the
#             global `df_manifestos`, which the function previously read implicitly.
# returns:    data frame with the columns country, party, issue, type, quarter and salience
#             (percentage share of an issue among a party's texts in a quarter, 0 when the
#             issue does not occur); NULL when no party in `manifestos` has any text.
get_agendas <- function(data, type, manifestos = df_manifestos) {
  parties <- unique(manifestos$party)
  issues <- unique(manifestos$issue)
  agendas <- vector("list", length(parties))

  for (i in seq_along(parties)) {
    party_tmp <- parties[i]
    df_tmp <- filter(data, party == party_tmp)
    if (nrow(df_tmp) == 0) {
      next
    }

    # a party label must map to one country; previously several values were recycled silently
    country_tmp <- unique(df_tmp$country)
    if (length(country_tmp) != 1) {
      stop("Party '", party_tmp, "' is not linked to exactly one country in the '", type,
           "' data", call. = FALSE)
    }

    # full issue x quarter grid, so that issues a party never mentions get salience 0
    quarters <- unique(df_tmp$quarter)
    grid <- data.frame(issue = rep(issues, times = length(quarters)),
                       quarter = rep(quarters, each = length(issues)))

    shares <- data.frame(table(df_tmp$issue, df_tmp$quarter, dnn = c("issue", "quarter"))) %>%
      group_by(quarter) %>%
      mutate(salience = Freq / sum(Freq) * 100) %>%
      ungroup()
    # table() turns the quarters into factor labels; convert back to numeric for the join
    shares$quarter <- as.numeric(as.character(shares$quarter))

    res <- left_join(grid, shares, by = c("issue", "quarter"))
    res$country <- country_tmp
    res$party <- party_tmp
    res$quarter <- as.numeric(as.character(res$quarter))
    res$type <- type
    res <- res[, c("country", "party", "issue", "type", "quarter", "salience")]
    res$salience[is.na(res$salience)] <- 0
    agendas[[i]] <- res
  }

  # skipped parties leave NULL entries, which rbind() ignores
  do.call(rbind, agendas)
}

# one agenda table per text type; `type` becomes the label stored in the result
df_pressreleases <- data_pressreleases %>% get_agendas(type = "Press releases")
df_tweets_party <- data_tweets_party %>% get_agendas(type = "Tweets (party)")
df_tweets_person <- data_tweets_person %>% get_agendas(type = "Tweets (person)")
df_parlspeeches <- data_parlspeeches %>% get_agendas(type = "Parl. speeches")

## add info on relevant manifesto
#
# data:    data frame with the columns `country` ("AT", "CH" or "DE") and `quarter` (numeric
#          year.quarter such as 2019.3).
# returns: `data` with the numeric column `relevant_manifesto` (YYYYMM) added or overwritten:
#          the older manifesto before the country's switch quarter, the newer one from that
#          quarter onwards; NA for other countries or a missing quarter.
add_manifesto_info <- function(data) {
  # per country: first quarter (coded YYYYQ) of the newer manifesto and both manifesto dates
  manifesto_periods <- list(
    AT = list(switch_quarter = 20193, before = 201710, from = 201909),
    CH = list(switch_quarter = 20194, before = 201510, from = 201910),
    DE = list(switch_quarter = 20213, before = 201709, from = 202109)
  )

  # compare integer-valued quarter codes: year.quarter doubles such as 2019.3 are not exactly
  # representable, so a threshold comparison on the raw value depends on rounding noise
  quarter_code <- round(data$quarter * 10)

  data$relevant_manifesto <- rep(NA_real_, nrow(data))
  for (country_code in names(manifesto_periods)) {
    period <- manifesto_periods[[country_code]]
    in_country <- data$country == country_code
    data$relevant_manifesto[in_country & quarter_code < period$switch_quarter] <- period$before
    data$relevant_manifesto[in_country & quarter_code >= period$switch_quarter] <- period$from
  }

  data
}

# tag every agenda row with the manifesto that applies in its country and quarter
df_pressreleases <- df_pressreleases %>% add_manifesto_info()
df_tweets_party <- df_tweets_party %>% add_manifesto_info()
df_tweets_person <- df_tweets_person %>% add_manifesto_info()
df_parlspeeches <- df_parlspeeches %>% add_manifesto_info()

#---------------------------------------------------------------------------------------------------------------------

# Create main data set

## basis of main data set
# Full party x issue x quarter grid: `party` is recycled, `issue` repeats once per party and
# `quarter` once per party-issue pair.
# NOTE(review): the quarters are taken from the press releases only, so quarters that occur
# solely in tweets or speeches are not part of the grid - confirm this is intended.
df <- data.frame(
  party = unique(df_manifestos$party),
  issue = rep(unique(df_manifestos$issue), each = length(unique(df_manifestos$party))),
  quarter = rep(unique(df_pressreleases$quarter),
                each = length(unique(df_manifestos$party)) * length(unique(df_manifestos$issue)))
)

# add each party's country (explicit key and column names instead of positions)
df <- left_join(df, unique(df_manifestos[, c("country", "party")]), by = "party")
df <- df[, c("country", "party", "issue", "quarter")]

# same country/quarter rules as for the agenda tables, so reuse the function instead of a copy
df <- add_manifesto_info(df)

## merge text type data frames into main data set
# The grid is joined with the manifesto saliences once (instead of four times) and the result is
# then joined with each agenda table. Join keys are spelled out; they are the columns the tables
# share, i.e. what the previous natural joins used.
manifesto_keys <- c("country", "party", "issue", "relevant_manifesto")
agenda_keys <- c("country", "party", "issue", "quarter", "relevant_manifesto")

df_manifesto_grid <- left_join(df, df_manifestos, by = manifesto_keys)

df_full_1 <- left_join(df_manifesto_grid, df_pressreleases, by = agenda_keys)
df_full_2 <- left_join(df_manifesto_grid, df_tweets_party, by = agenda_keys)
df_full_3 <- left_join(df_manifesto_grid, df_tweets_person, by = agenda_keys)
df_full_4 <- left_join(df_manifesto_grid, df_parlspeeches, by = agenda_keys)

# Stack the four text types; grid rows without an agenda entry (type is NA) or without a
# manifesto salience are dropped.
df_full <- rbind(df_full_1, df_full_2, df_full_3, df_full_4) %>%
  filter(!is.na(type)) %>%
  filter(!is.na(manifesto_salience))

# press releases are the reference category of the text-type factor
df_full$type <- relevel(as.factor(df_full$type), ref = "Press releases")

#---------------------------------------------------------------------------------------------------------------------

# Additional meta data

## Time since last election

# Quarter of the relevant manifesto: the first four digits of the YYYYMM value are the year, the
# last two the month; the first day of that month is passed to quarter().
df_full$relevant_manifesto_quarter <- quarter(
  paste(
    stri_extract_first(df_full$relevant_manifesto, regex = "[[:digit:]]{4,4}"),
    stri_extract_last(df_full$relevant_manifesto, regex =  "[[:digit:]]{2,2}"),
    "01",
    sep = "-"
  ),
  with_year = TRUE
)

# Difference between the observation quarter and the manifesto quarter; yearqtr values differ in
# years, so the difference is multiplied by 4.
df_full$time_since_last_election <- with(
  df_full,
  (as.yearqtr(as.character(quarter), format = "%Y.%q") -
     as.yearqtr(as.character(relevant_manifesto_quarter), format = "%Y.%q")) * 4
)

## Government/opposition parties

# 1 = coded as government party, 0 = otherwise
df_full$government <- 0

### Austria
# ÖVP and FPÖ/VDU are coded 1 up to 2019.2, ÖVP and Grüne/GA from 2020 onwards
up_to_2019q2 <- df_full$quarter <= 2019.2
from_2020 <- df_full$quarter >= 2020.0
gov_at <- (df_full$party == "ÖVP" & (up_to_2019q2 | from_2020)) |
  (df_full$party == "FPÖ/VDU" & up_to_2019q2) |
  (df_full$party == "Grüne/GA" & from_2020)
df_full$government[which(gov_at)] <- 1

### Germany
# coded 1 in every quarter
df_full$government[df_full$party %in% c("SPD", "CDU/CSU")] <- 1

### Switzerland
# coded 1 in every quarter
df_full$government[df_full$party %in% c("Mitte", "SPS/PSS", "FDP/PLR", "SVP/UDC")] <- 1

## Mainstream/Niche parties

# 1 = party coded as niche party, 0 = otherwise
df_full$niche_party <- 0
df_full$niche_party[df_full$party %in% c("AfD", "Bündnis90/Die Grünen", "FPÖ/VDU", "GLP",
                                         "GPS/PES", "Grüne/GA", "PILZ/JETZT",
                                         "SVP/UDC")] <- 1

## Campaign

# 1 = observation falls into the campaign quarter of its own country, 0 = otherwise.
# Fix: the quarter conditions were previously applied without a country condition, so e.g.
# German and Swiss parties were flagged in the Austrian campaign quarter as well.
df_full$campaign <- 0

# campaign quarter per country, coded as YYYYQ; integer-valued codes avoid an exact `==`
# comparison on year.quarter doubles such as 2019.3
campaign_quarters <- c(AT = 20193, DE = 20213, CH = 20194)
quarter_code <- round(df_full$quarter * 10)

for (country_code in names(campaign_quarters)) {
  idx_campaign <- which(df_full$country == country_code &
                          quarter_code == campaign_quarters[[country_code]])
  df_full$campaign[idx_campaign] <- 1
}

#---------------------------------------------------------------------------------------------------------------------

# Save main data set

# written as a single R object; read it back with readRDS("Main_Data.RDS")
saveRDS(object = df_full, file = "Main_Data.RDS")

